{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['this', 'aord', 'with', 'and']\n",
      "['is', 'count', 'a', None]\n",
      "['\\x0f', 'test', 'few', None]\n",
      "[None, None, 'word', None]\n",
      "[None, None, 'repeats', 'words']\n"
     ]
    }
   ],
   "source": [
    "import cudf\n",
    "import nvstrings\n",
    "import rmm\n",
    "import numpy as np\n",
    "\n",
    "lines = [\n",
    " 'this IS a',\n",
    " 'word COUNT TEST',\n",
    " 'with a few word repeats',\n",
    " 'and some junk filler words'\n",
    "]\n",
    "\n",
    "test_strings = nvstrings.to_device(lines)\n",
    "\n",
    "# lowercase all words\n",
    "tmp = test_strings.lower()\n",
    "\n",
    "# remove stop words\n",
    "for word in ['some', 'junk', 'filler']:\n",
    "    tmp = tmp.replace(word, '')\n",
    "    \n",
    "# Split strings into word columns\n",
    "cols = tmp.split_column(' ')\n",
    "# Note the 'None' values for lines with fewer words\n",
    "for col in cols: print(col)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  words_0 words_1 words_2 words_3 words_4\n",
      "0 3559070    3370      15       0       0\n"
     ]
    }
   ],
   "source": [
    "# Convert the nvstrings to a PyGDF DataFrame, one hashed word per row\n",
    "test_df = cudf.dataframe.DataFrame()\n",
    "\n",
    "for idx, col in enumerate(cols):\n",
    "    word_array = rmm.device_array(col.size(), dtype=np.uint32)\n",
    "    col.hash(word_array.device_ctypes_pointer.value)\n",
    "    test_df['words_'+str(idx)] = cudf.Series(word_array)\n",
    "\n",
    "# Note the '0' values, corresponding to the 'None' above\n",
    "# Original string: \"this is a\"\n",
    "print(test_df.head(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   word_hash\n",
      " 0   3559070\n",
      " 1   3000032\n",
      " 2   3649734\n",
      " 3     96727\n",
      " 0      3370\n",
      " 1  94851343\n",
      " 2        97\n",
      " 3         0\n",
      " 0        15\n",
      " 1   3556498\n",
      "[10 more rows]\n"
     ]
    }
   ],
   "source": [
    "# Create a new DF with one entry per word hash\n",
    "test_words_df = cudf.dataframe.DataFrame()\n",
    "test_words_df['word_hash'] = cudf.multi.concat([test_df[col] for col in list(test_df.columns)])\n",
    "test_words_df['word_hash'] = test_words_df['word_hash'].astype('int32')\n",
    "print(test_words_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Remember, this df still contains one entry per 'None'\n",
    "len(test_words_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       \n",
      "15    1\n",
      "97    1\n",
      "3370    1\n",
      "96727    1\n",
      "101272    1\n",
      "3000032    1\n",
      "3556498    1\n",
      "3559070    1\n",
      "3649734    1\n",
      "3655434    1\n",
      "[3 more rows]\n"
     ]
    }
   ],
   "source": [
    "# Filter nulls, and print GPU accelerated word count results\n",
    "filtered_words_df = test_words_df.query('word_hash != 0')\n",
    "print(filtered_words_df['word_hash'].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now lets do the same thing, but with a LOT of words from 9 million tweets\n",
    "# Obtain from https://about.twitter.com/en_us/values/elections-integrity.html#data\n",
    "\n",
    "# nvstrings has a temporary convenience function for grabbing a single column from a CSV file\n",
    "# in the data linked above, column 12 is the text of the actual tweets\n",
    "%time tweets = nvstrings.from_csv('/raid/tweets/ira_tweets_csv_hashed.csv', 12)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop words from https://gist.github.com/sebleier/554280\n",
    "import urllib.request\n",
    "url = \"https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK's%2520list%2520of%2520english%2520stopwords\"\n",
    "\n",
    "response = urllib.request.urlopen(url)\n",
    "stop_words = response.read().decode('utf-8').split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tweets = tweets.lower()\n",
    "\n",
    "for word in stop_words:\n",
    "    tweets = tweets.replace(word, '')\n",
    "cols = tweets.split_column(' ')\n",
    "\n",
    "df = cudf.dataframe.DataFrame()\n",
    "\n",
    "for idx, col in enumerate(cols):\n",
    "    word_array = rmm.device_array(col.size(), dtype=np.uint32)\n",
    "    col.hash(word_array.device_ctypes_pointer.value)\n",
    "    df['words_'+str(idx)] = cudf.Series(word_array)\n",
    "    col.free()\n",
    "\n",
    "cols.free()\n",
    "tweets.free()\n",
    "\n",
    "# Remember, split_column gives 1 column per word. 121 columns means the \"wordiest\" tweet had 121 spaces\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "ename": "CudaAPIError",
     "evalue": "[1] Call to cuMemcpyDtoD results in CUDA_ERROR_INVALID_VALUE",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mCudaAPIError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-8-fc1400580bf1>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf_words\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'word_hash'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcudf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmulti\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mcol\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/cudf-0.2.0-py3.5.egg/cudf/multi.py\u001b[0m in \u001b[0;36mconcat\u001b[0;34m(objs, ignore_index)\u001b[0m\n\u001b[1;32m     34\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_concat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobjs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_index\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_index\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     35\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0mtyp\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 36\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mSeries\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_concat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobjs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     37\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0missubclass\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtyp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     38\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_concat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobjs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/cudf-0.2.0-py3.5.egg/cudf/series.py\u001b[0m in \u001b[0;36m_concat\u001b[0;34m(cls, objs, index)\u001b[0m\n\u001b[1;32m    391\u001b[0m         \u001b[0;31m# Concatenate index if not provided\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    392\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 393\u001b[0;31m             \u001b[0mindex\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_concat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mobjs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    394\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    395\u001b[0m         \u001b[0mnames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mobjs\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/cudf-0.2.0-py3.5.egg/cudf/index.py\u001b[0m in \u001b[0;36m_concat\u001b[0;34m(cls, objs)\u001b[0m\n\u001b[1;32m     83\u001b[0m     \u001b[0;34m@\u001b[0m\u001b[0mclassmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     84\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_concat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobjs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 85\u001b[0;31m         \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mColumn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_concat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mobjs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     86\u001b[0m         \u001b[0;31m# TODO: add ability to concatenate indices without always casting to\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     87\u001b[0m         \u001b[0;31m# `GenericIndex`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/cudf-0.2.0-py3.5.egg/cudf/column.py\u001b[0m in \u001b[0;36m_concat\u001b[0;34m(cls, objs)\u001b[0m\n\u001b[1;32m     51\u001b[0m         \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mBuffer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_empty\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmem\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     52\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mo\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mobjs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 53\u001b[0;31m             \u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_gpu_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     54\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     55\u001b[0m         \u001b[0;31m# Concatenate mask if present\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/cudf-0.2.0-py3.5.egg/cudf/buffer.py\u001b[0m in \u001b[0;36mextend\u001b[0;34m(self, array)\u001b[0m\n\u001b[1;32m    158\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sentry_capacity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mneeded\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    159\u001b[0m         \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcudautils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 160\u001b[0;31m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmem\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy_to_device\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    161\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mneeded\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    162\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/numba/cuda/cudadrv/devices.py\u001b[0m in \u001b[0;36m_require_cuda_context\u001b[0;34m(*args, **kws)\u001b[0m\n\u001b[1;32m    210\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_require_cuda_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkws\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    211\u001b[0m         \u001b[0mget_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 212\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkws\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    213\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    214\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0m_require_cuda_context\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/numba/cuda/cudadrv/devicearray.py\u001b[0m in \u001b[0;36mcopy_to_device\u001b[0;34m(self, ary, stream)\u001b[0m\n\u001b[1;32m    192\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    193\u001b[0m             \u001b[0msz\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0malloc_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mary\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0malloc_size\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 194\u001b[0;31m             \u001b[0m_driver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice_to_device\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mary\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msz\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    195\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    196\u001b[0m             \u001b[0;31m# Ensure same contiguous-nous. Only copies (host-side)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py\u001b[0m in \u001b[0;36mdevice_to_device\u001b[0;34m(dst, src, size, stream)\u001b[0m\n\u001b[1;32m   1835\u001b[0m         \u001b[0mfn\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdriver\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcuMemcpyDtoD\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1836\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1837\u001b[0;31m     \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice_pointer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdst\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevice_pointer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0mvarargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1838\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1839\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py\u001b[0m in \u001b[0;36msafe_cuda_api_call\u001b[0;34m(*args)\u001b[0m\n\u001b[1;32m    288\u001b[0m             \u001b[0m_logger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdebug\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'call driver api: %s'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlibfn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    289\u001b[0m             \u001b[0mretcode\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlibfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 290\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mretcode\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    291\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0msafe_cuda_api_call\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    292\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/conda/envs/cudf/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py\u001b[0m in \u001b[0;36m_check_error\u001b[0;34m(self, fname, retcode)\u001b[0m\n\u001b[1;32m    323\u001b[0m                     \u001b[0m_logger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcritical\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_getpid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    324\u001b[0m                     \u001b[0;32mraise\u001b[0m \u001b[0mCudaDriverError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"CUDA initialized before forking\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mCudaAPIError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mretcode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    326\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    327\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mget_device\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdevnum\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mCudaAPIError\u001b[0m: [1] Call to cuMemcpyDtoD results in CUDA_ERROR_INVALID_VALUE"
     ]
    }
   ],
   "source": [
    "df_words['word_hash'] = cudf.multi.concat([df[col] for col in list(df.columns)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(df_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter null words\n",
    "%time len(df_words.query('word_hash != 0'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_words['word_hash'].unique_count()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
